
###################################################
##### Haiti elite network project      		  	
##### cleaning Haiti BM and Orbis ownership data
##### 2014 oct 05								            	
###################################################

## read in agemar trade data
## make dummies for products where we know families and are in CPI
## make grid of all family-product combos
## graph relationship between 2009 and 2011 shipments


age <- read.csv("01_Data/02_Clean/agemar_ha.csv")
age <- subset(age, age$year!=2010)


#####
## fix a few duplicated names by hand
#####

temp <- data.table(age)

# temp$con_final[temp$con_final=="V & F (VORBE & FILS) CONSTRUCTION"] <- "VORBE & FILS CONSTRUCTION"
# temp$con_final[temp$con_final=="TASHA STORE"] <- "TASHA FOOD"
# temp$con_final[temp$con_final=="SUN AUTO"] <- "SUNAUTO"
# temp$con_final[temp$con_final=="SDG"] <- "SOCIETE DE DISTRIBUTION GENERALE"
# temp$con_final[temp$con_final=="NABATCO"] <- "NATIONAL BAG & TRADING"
# temp$con_final[temp$con_final=="MSC"] <- "MSC TRADING"
# temp$con_final[temp$con_final=="INTERNATIONAL TRADERS"] <- "HIT INTERNATIONAL TRADERS"
# temp$con_final[temp$con_final=="ADNAMA DISTRIBUTION IMPORT-EXPORT"] <- "ADNAMA DISTRIBUTION"
# temp$con_final[temp$con_final=="A & B CENTRE D'AFFAIRES ACHATS & VENTE"] <- "A & B HARDWARE"

temp <- subset(temp, temp$con_final!="PARTICULIER" & temp$con_final!="NON DISPONIBLE")


## calculate annual prices
us <- read.csv("01_Data/02_Clean/prices_all_month.csv")
us <- data.table(us)
us <- us[,lapply(.SD, FUN = mean, na.rm=T), by = c('hs_four', 'year'),
         .SDcols = c('price_ha', 'price_wo')]
us <- subset(us, us$year==2009 | us$year==2011)
us$price_ha <- ifelse(us$price_ha=='NaN', NA, us$price_ha)
us$price_wo <- ifelse(us$price_wo=='NaN', NA, us$price_wo)
us <- reshape(us, idvar='hs_four', timevar = 'year', dir = 'wide')


## calculate qty and wgt per year
temp2 <- temp[,lapply(.SD, FUN = sum, na.rm = T), by = c('con_final', 'hs_four', 'year'),
              .SDcols = c('qty', 'wgt_kg')]
temp2 <- as.data.frame(temp2)


## reshape to long and set zeros
temp2 <- reshape(temp2, idvar = c('con_final', 'hs_four'), dir = 'wide', timevar = 'year')
vars = c('qty.2009', 'qty.2011', 'wgt_kg.2009', 'wgt_kg.2011')
for (i in 1:length(vars)){
  temp2[,vars[i]] <- ifelse(is.na(temp2[,vars[i]])==T, 0, temp2[,vars[i]])
}


## merge in price data
temp <- merge(temp2, us, by = 'hs_four', all.x = T)


## calculate value
temp$value.2009 <- temp$qty.2009*temp$price_wo.2009
temp$value.2011 <- temp$qty.2011*temp$price_wo.2011


## delete unavailable comps and hscodes
temp <- subset(temp, temp$con_final!="NON DISPONIBLE" & is.na(temp$hs_four)==F)


## make dummy for family data
fam <- read.dta('01_Data/02_Clean/fam_biz_prod.dta')
fam <- fam[fam$fam!="NA",]
fam <- data.frame('con_final' = unique(fam$con_final), 'fam_known' = 1)
temp$fam_known <- ifelse(temp$con_final %in% fam$con_final, 1, 0)


## make dummy for CPI products
cpi <- read.csv('01_Data/02_Clean/cpiprod.csv')
cpi$hscode <- sprintf("%04d", cpi$hscode)
temp$hs_four <- sprintf("%04s", temp$hs_four)
temp$cpiprod <- ifelse(temp$hs_four %in% cpi$hscode, 1, 0)


## make logs
temp$qty_log.2009 <- log(temp$qty.2009 + 1)
temp$qty_log.2011 <- log(temp$qty.2011 + 1)
temp$value_log.2009 <- log(temp$value.2009 + 1)
temp$value_log.2011 <- log(temp$value.2011 + 1)
temp$wgt_kg_log.2009 <- log(temp$wgt_kg.2009 + 1)
temp$wgt_kg_log.2011 <- log(temp$wgt_kg.2011 + 1)


## make table of proportion of firms that import in one or both years
table(temp$value.2009!=0, temp$value.2011!=0)/sum(table(temp$value.2009!=0, temp$value.2011!=0))
temp2 <- subset(temp, temp$fam_known==1)
table(temp2$value.2009!=0, temp2$value.2011!=0)/sum(table(temp2$value.2009!=0, temp2$value.2011!=0))
temp2 <- subset(temp, temp$cpiprod==1)
table(temp2$value.2009!=0, temp2$value.2011!=0)/sum(table(temp2$value.2009!=0, temp2$value.2011!=0))


#####
## calculate by product the proportion of 2009 shipments that are continued in 2011
#####

temp2$product <- ifelse(temp2$hs_four=='2402', 'Cigarettes',
                        ifelse(temp2$hs_four=="2202", "Cola", 
                               ifelse(temp2$hs_four=="1507" | temp2$hs_four=="1508" | temp2$hs_four=="1509", "Huile comestible",
                                      ifelse(temp2$hs_four=="2710", "Kerosene",
                                             ifelse(temp2$hs_four=="0402", "Lait evapore non sucre",
                                                    ifelse(temp2$hs_four=="1103", "Mais moulu",
                                                           ifelse(temp2$hs_four=="3003" | temp2$hs_four=="3004", "Medicaments",
                                                                  ifelse(temp2$hs_four=="9401" | temp2$hs_four=="9404", "Meubles de salon",
                                                                         ifelse(temp2$hs_four=="1905", "Pain",
                                                                                ifelse(temp2$hs_four=="0713", "Pois sec",
                                                                                       ifelse(temp2$hs_four=="0303" | temp2$hs_four=="305", "Poisson frais",
                                                                                              ifelse(temp2$hs_four=="0207", "Poulet",
                                                                                                     ifelse(temp2$hs_four=="1006", "Riz",
                                                                                                            ifelse(temp2$hs_four=="6405", "Sandales",
                                                                                                                   ifelse(temp2$hs_four=="3402", "Savon de lessive",
                                                                                                                          ifelse(temp2$hs_four=="3303" | temp2$hs_four=="3304", "Soins esthetiques",
                                                                                                                                 ifelse(temp2$hs_four=="1701", "Sucre brut",
                                                                                                                                        ifelse(temp2$hs_four=="5212", "Tissus", NA))))))))))))))))))
temp3 <- data.table(temp2)

temp3$wgt_retain.2011 <- ifelse(temp3$wgt_kg.2011 > temp3$wgt_kg.2009, temp3$wgt_kg.2009, temp3$wgt_kg.2011)
temp3$qty_retain.2011 <- ifelse(temp3$qty.2011 > temp3$qty.2009, temp3$qty.2009, temp3$qty.2011)

temp3 <- temp3[,lapply(.SD, FUN = mean, na.rm=T), by = 'product', 
               .SDcols = c('qty.2009', 'wgt_kg.2009', 'qty_retain.2011', 'wgt_retain.2011')]
temp3$wgt_retain <- temp3$wgt_retain.2011 / temp3$wgt_kg.2009
temp3$qty_retain <- temp3$qty_retain.2011 / temp3$qty.2009

write.csv(temp3, '01_Data/03_Outputs/product_0911_retention.csv')


#####
## create grid of all combos
#####

## create grid of all combos of shippers and products
grid <- expand.grid('con_final' = unique(temp$con_final), 'hs_four' = unique(temp$hs_four))
grid <- merge(grid, temp, by = c('con_final', 'hs_four'), all.x = T)


## set zeros
vars = c('qty.2009', 'qty.2011', 'wgt_kg.2009', 'wgt_kg.2011', 'value.2009', 'value.2011')
for (i in 1:length(vars)){
  grid[,vars[i]] <- ifelse(is.na(grid[,vars[i]])==T, 0, grid[,vars[i]])
}


## make dummy for family data and CPI products
grid$fam_known <- ifelse(grid$con_final %in% fam$con_final, 1, 0)
grid$cpiprod <- ifelse(grid$hs_four %in% cpi$hscode, 1, 0)


## make logs
grid$qty_log.2009 <- log(grid$qty.2009 + 1)
grid$qty_log.2011 <- log(grid$qty.2011 + 1)
grid$value_log.2009 <- log(grid$value.2009 + 1)
grid$value_log.2011 <- log(grid$value.2011 + 1)


## look at tables of proportions
temp2 <- subset(grid, !(grid$qty.2009==0 & grid$qty.2011==0))
table(temp2$qty.2009!=0, temp2$qty.2011!=0)/sum(table(temp2$qty.2009!=0, temp2$qty.2011!=0))
temp2 <- subset(grid, grid$fam_known==1 & !(grid$qty.2009==0 & grid$qty.2011==0))
table(temp2$qty.2009!=0, temp2$qty.2011!=0)/sum(table(temp2$qty.2009!=0, temp2$qty.2011!=0))
temp2 <- subset(grid, grid$fam_known==1 & grid$cpiprod==1 & !(grid$qty.2009==0 & grid$qty.2011==0))
table(temp2$qty.2009!=0, temp2$qty.2011!=0)/sum(table(temp2$qty.2009!=0, temp2$qty.2011!=0))


## set cutpoint
cut = 0.99




#####
## graphs - continuity 2009-2011
#####

## value

jpeg('03_Figures/grid_value_0911_log.jpg')

lim2009 = quantile(grid$value_log.2009, na.rm = T, 1)
lim2011 = quantile(grid$value_log.2011, na.rm = T, 1)
mod = summary(lm(value_log.2011 ~ value_log.2009, data = grid))

ggplot(grid, aes(x = value_log.2009, y = value_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/6), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Value - 2009", y = "Value - 2011")

dev.off()



jpeg('03_Figures/grid_value_0911_log_fam.jpg')

grid2 <- subset(grid, grid$fam_known==1)

lim2009 = quantile(grid2$value_log.2009, na.rm = T, 1)
lim2011 = quantile(grid2$value_log.2011, na.rm = T, 1)
mod = summary(lm(value_log.2011 ~ value_log.2009, data = grid2))

ggplot(grid2, aes(x = value_log.2009, y = value_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/6), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Value - 2009", y = "Value - 2011") 

dev.off()


jpeg('03_Figures/grid_value_0911_log_cpi.jpg')

grid3 <- subset(grid, grid$cpiprod==1)

lim2009 = quantile(grid3$value_log.2009, na.rm = T, 1)
lim2011 = quantile(grid3$value_log.2011, na.rm = T, 1)
mod = summary(lm(value_log.2011 ~ value_log.2009, data = grid3))

ggplot(grid3, aes(x = value_log.2009, y = value_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/6), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Value - 2009", y = "Value - 2011") 

dev.off()


## quantity

jpeg('03_Figures/grid_qty_0911_log.jpg')

lim2009 = quantile(grid$qty_log.2009, na.rm = T, 1)
lim2011 = quantile(grid$qty_log.2011, na.rm = T, 1)
mod = summary(lm(qty_log.2011 ~ qty_log.2009, data = grid))

ggplot(grid, aes(x = qty_log.2009, y = qty_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/6), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Quantity - 2009", y = "Quantity - 2011")

dev.off()



jpeg('03_Figures/grid_qty_0911_log_fam.jpg')

grid2 <- subset(grid, grid$fam_known==1)

lim2009 = quantile(grid2$qty_log.2009, na.rm = T, 1)
lim2011 = quantile(grid2$qty_log.2011, na.rm = T, 1)
mod = summary(lm(qty_log.2011 ~ qty_log.2009, data = grid2))

ggplot(grid2, aes(x = qty_log.2009, y = qty_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/8), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
                          p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Quantity - 2009", y = "Quantity - 2011")

dev.off()


jpeg('03_Figures/grid_qty_0911_log_cpi.jpg')

lim2009 = quantile(grid3$qty_log.2009, na.rm = T, 1)
lim2011 = quantile(grid3$qty_log.2011, na.rm = T, 1)
mod = summary(lm(qty_log.2011 ~ qty_log.2009, data = grid3))

ggplot(grid3, aes(x = qty_log.2009, y = qty_log.2011)) +
  geom_point() +
  geom_smooth(method = 'lm', se = F) +
  annotate(geom = 'text', x = (lim2009/6), y = (lim2011 - lim2011/10),
           label = paste0('beta = ', round(mod$coefficients[2,1], 2), '
p-value = ', round(mod$coefficients[2,4], 2)), size = 8) +
  theme_minimal() +
  theme(text = element_text(size=16)) +
  labs(x = "Quantity - 2009", y = "Quantity - 2011")

dev.off()



#####
## calcualte churn rates
#####

grid$imp.2009 <- ifelse(grid$qty.2009>0, 1, 0)
grid$imp.2011 <- ifelse(grid$qty.2011>0, 1, 0)

## all firm-product dyads

tab <- data.table(grid)
tab <- tab[,lapply(.SD, FUN = sum, na.rm = T), by = c('imp.2009', 'imp.2011'),
           .SDcols = c('value.2009', 'qty.2009', 'value.2011', 'qty.2011')]
tab <- as.data.frame(tab)

## pct of value in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'value.2009'])
## pct of quantity in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'qty.2009'])

## pct of value in 2011 that imported same prod in 2009
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'value.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'])
## pct of value in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'qty.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'])



## firm-product dyads matched to fams

grid2 <- subset(grid, grid$fam_known==1)
tab <- data.table(grid2)
tab <- tab[,lapply(.SD, FUN = sum, na.rm = T), by = c('imp.2009', 'imp.2011'),
           .SDcols = c('value.2009', 'qty.2009', 'value.2011', 'qty.2011')]
tab <- as.data.frame(tab)

## pct of value in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'value.2009'])
## pct of quantity in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'qty.2009'])

## pct of value in 2011 that imported same prod in 2009
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'value.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'])
## pct of value in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'qty.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'])


## firm-product dyads matched to fams and in CPI products

grid3 <- subset(grid, grid$fam_known==1 & grid$cpiprod==1)
tab <- data.table(grid3)
tab <- tab[,lapply(.SD, FUN = sum, na.rm = T), by = c('imp.2009', 'imp.2011'),
           .SDcols = c('value.2009', 'qty.2009', 'wgt_kg.2009', 'value.2011', 'qty.2011', 'wgt_kg.2011')]
tab <- as.data.frame(tab)

## pct of value in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'value.2009'])
## pct of quantity in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'qty.2009'])
## pct of wgt in 2009 that still imports same prod in 2011
tab[tab$imp.2009==1 & tab$imp.2011==1, 'wgt_kg.2009'] / (tab[tab$imp.2009==1 & tab$imp.2011==1, 'wgt_kg.2009'] + tab[tab$imp.2009==1 & tab$imp.2011==0, 'wgt_kg.2009'])

## pct of value in 2011 that imported same prod in 2009
tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'value.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'value.2011'])
## pct of value in 2011 that imported same prod in 2009
tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'qty.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'qty.2011'])
## pct of wgt in 2011 that imported same prod in 2009
tab[tab$imp.2009==1 & tab$imp.2011==1, 'wgt_kg.2011'] / (tab[tab$imp.2009==0 & tab$imp.2011==1, 'wgt_kg.2011'] + tab[tab$imp.2009==1 & tab$imp.2011==1, 'wgt_kg.2011'])



#####
## calculate correlations
#####

## all data

cor(grid$value.2009, grid$value.2011, use = 'complete.obs')
cor(grid$value_log.2009, grid$value_log.2011, use = 'complete.obs')

## families identified

grid2 <- subset(grid, grid$fam_known==1)
cor(grid2$value.2009, grid2$value.2011, use = 'complete.obs')
cor(grid2$value_log.2009, grid2$value_log.2011, use = 'complete.obs')

## CPI sample

grid3 <- subset(grid, grid$cpiprod==1 & grid$fam_known==1)
cor(grid3$value.2009, grid3$value.2011, use = 'complete.obs')
cor(grid3$value_log.2009, grid3$value_log.2011, use = 'complete.obs')


